import logging

# Configure the root logger once for the whole notebook:
# ISO-8601 timestamps with offset, INFO level and above.
_LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s"
_LOG_DATEFMT = "%Y-%m-%dT%H:%M:%S%z"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATEFMT)
# common libs
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px  # for visualization
from scipy.cluster import hierarchy
# extra utils
import missingno as msno  # a library to analyze missing data
# Inline plotting: `%matplotlib inline` is IPython magic and a SyntaxError in a
# plain .py module, so invoke it through the IPython API when available.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # not running under IPython/Jupyter; the default backend is fine
# plotting backend
pd.options.plotting.backend = "matplotlib"
# set FutureWarnings off
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
from sklearn.exceptions import ConvergenceWarning
# warnings.filterwarnings('ignore', category=ConvergenceWarning) #sklearn.exceptions.ConvergenceWarning
2024-04-23T14:23:34+0300 INFO: NumExpr defaulting to 8 threads.
# Load the tab-separated obesity dataset and eyeball a random sample of rows.
df = pd.read_csv("obesity.data.txt", sep="\t")
df.sample(n=20)
| Gender | Age | Height | Weight | FHO | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | WeightClass | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1105 | Female | 26.000000 | 1.640125 | NaN | yes | yes | 3.000000 | 3.000000 | Sometimes | no | NaN | no | 0.000000 | 0.162494 | NaN | NaN | 4 |
| 387 | Male | 39.759575 | NaN | 101.780099 | yes | yes | NaN | NaN | Sometimes | no | NaN | no | 2.998981 | 1.000000 | NaN | Automobile | 4 |
| 1688 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.000000 | 3.000000 | Sometimes | no | NaN | NaN | 1.026452 | NaN | Sometimes | Public_Transportation | 4 |
| 1077 | Male | 18.000000 | 1.708107 | 51.314659 | yes | yes | 1.303878 | 3.000000 | Sometimes | no | 1.755497 | no | 0.062932 | NaN | Sometimes | NaN | 1 |
| 1195 | Female | NaN | NaN | NaN | yes | no | NaN | NaN | Frequently | no | 1.000000 | no | 0.000000 | 1.000000 | no | Automobile | 2 |
| 1755 | Female | 16.370009 | 1.613921 | NaN | yes | NaN | 2.206399 | 1.000000 | NaN | no | 2.000000 | yes | 0.948930 | 1.772463 | no | Public_Transportation | 3 |
| 1234 | Female | 34.204408 | 1.664927 | 80.386078 | NaN | yes | NaN | 3.000000 | Sometimes | no | 2.641642 | no | NaN | 1.503010 | no | Automobile | 3 |
| 1547 | Male | 31.783845 | 1.750000 | 120.000000 | yes | yes | 2.941627 | 3.000000 | Sometimes | no | 2.318134 | no | 0.582686 | NaN | Sometimes | Automobile | 4 |
| 1191 | Male | NaN | NaN | 105.254354 | yes | yes | NaN | 2.036794 | Sometimes | no | NaN | no | 1.062011 | 1.771135 | NaN | NaN | 4 |
| 151 | Female | 33.732714 | 1.679725 | NaN | yes | yes | 3.000000 | NaN | Sometimes | NaN | NaN | no | 0.261274 | NaN | Sometimes | Automobile | 3 |
| 1771 | Male | 21.379676 | 1.701413 | 96.710735 | yes | yes | 2.000000 | 2.961192 | Sometimes | no | NaN | no | 0.981686 | 1.355370 | no | Public_Transportation | 4 |
| 723 | Male | 17.000000 | 1.690000 | NaN | no | NaN | NaN | 3.000000 | NaN | no | NaN | NaN | 2.000000 | 1.000000 | Sometimes | Walking | 2 |
| 900 | Male | NaN | NaN | NaN | yes | yes | 2.935157 | 1.845858 | NaN | no | 1.000000 | no | 1.089891 | 0.715993 | no | Public_Transportation | 4 |
| 63 | Female | 23.000000 | 1.630000 | 83.000000 | NaN | no | 3.000000 | NaN | Sometimes | yes | 3.000000 | no | NaN | 2.000000 | NaN | Public_Transportation | 4 |
| 964 | Female | 22.679454 | 1.628260 | 82.967937 | yes | NaN | NaN | 1.000000 | NaN | no | 2.269500 | no | NaN | NaN | NaN | Public_Transportation | 4 |
| 824 | Male | 25.526746 | 1.783381 | NaN | yes | yes | 2.191429 | NaN | NaN | no | 2.102709 | no | 1.325340 | 0.380979 | Sometimes | Public_Transportation | 4 |
| 128 | Male | 23.562135 | NaN | 75.371244 | yes | yes | 2.000000 | 3.000000 | Sometimes | no | NaN | no | 0.121585 | NaN | Sometimes | Public_Transportation | 3 |
| 1668 | Male | 20.000000 | 1.650000 | 80.000000 | NaN | no | 2.000000 | 3.000000 | Sometimes | NaN | 2.000000 | no | 1.000000 | 2.000000 | NaN | NaN | 3 |
| 578 | Male | 23.000000 | 1.742500 | 105.028665 | yes | yes | 2.393837 | 3.000000 | Sometimes | no | 2.014990 | no | 0.978815 | 0.413220 | Sometimes | Public_Transportation | 4 |
| 2079 | Male | 21.000000 | 1.620000 | 70.000000 | no | yes | 2.000000 | 1.000000 | no | NaN | 3.000000 | no | NaN | 0.000000 | Sometimes | NaN | 3 |
# Split the columns by type for later selection; WeightClass is the target.
numeric_vars = [
    "Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE",
]
categoric_vars = [
    "Gender", "FHO", "FAVC", "CAEC", "SMOKE", "SCC", "CALC",
    "MTRANS", "WeightClass",
]
# Columns pandas inferred as numeric (note: includes the int-coded target).
df.select_dtypes(include=[np.number]).columns
Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
'WeightClass'],
dtype='object')
# Summary statistics of the numeric features. NaNs are excluded per column,
# which is why the `count` values differ between rows.
df[numeric_vars].describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 1573.0 | 24.253224 | 6.276876 | 14.00 | 19.920629 | 22.693989 | 26.000000 | 55.137881 |
| Height | 1465.0 | 1.701895 | 0.093146 | 1.45 | 1.629010 | 1.702825 | 1.767186 | 1.980000 |
| Weight | 1514.0 | 86.366451 | 26.468326 | 39.00 | 65.238481 | 82.353453 | 107.009192 | 173.000000 |
| FCVC | 1676.0 | 2.417900 | 0.535480 | 1.00 | 2.000000 | 2.348344 | 3.000000 | 3.000000 |
| NCP | 1632.0 | 2.691339 | 0.778604 | 1.00 | 2.690432 | 3.000000 | 3.000000 | 4.000000 |
| CH2O | 1542.0 | 2.007783 | 0.608349 | 1.00 | 1.591468 | 2.000000 | 2.466337 | 3.000000 |
| FAF | 1504.0 | 1.020704 | 0.853450 | 0.00 | 0.143955 | 1.000000 | 1.677413 | 3.000000 |
| TUE | 1536.0 | 0.650987 | 0.602905 | 0.00 | 0.000000 | 0.619123 | 1.000000 | 2.000000 |
# Interactive box plots via the plotly backend; Age and Weight are plotted
# separately because their scales dwarf the roughly unit-range features.
cols = ["Height", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
pd.options.plotting.backend = "plotly"
df[cols].boxplot(title="Box Plots")
for column, fig_title in (("Age", "Box Plot Age"), ("Weight", "Box Plot Weight")):
    df[[column]].boxplot(title=fig_title)
# restore the default backend for the rest of the notebook
pd.options.plotting.backend = "matplotlib"
# Show the value domain of every non-numeric column (plus the target),
# preserving the original column order of the frame.
non_numeric_cols = [c for c in df.columns if c not in numeric_vars]
for col in non_numeric_cols:
    print(f"Unique values for {col}: {df[col].unique()}\n")
Unique values for Gender: ['Female' 'Male'] Unique values for FHO: [nan 'yes' 'no'] Unique values for FAVC: ['no' nan 'yes'] Unique values for CAEC: [nan 'Sometimes' 'Always' 'Frequently' 'no'] Unique values for SMOKE: ['no' nan 'yes'] Unique values for SCC: ['no' nan 'yes'] Unique values for CALC: ['no' 'Sometimes' nan 'Frequently' 'Always'] Unique values for MTRANS: ['Public_Transportation' 'Automobile' nan 'Walking' 'Motorbike' 'Bike'] Unique values for WeightClass: [1 2 4 3]
# FCVC, NCP, FAF and TUE came from survey questions with only a handful of
# discrete answer options -- count distinct values to show they are now
# (suspiciously) continuous.
df['FCVC'].nunique()
645
df['NCP'].nunique()
489
df['FAF'].nunique()
863
df['TUE'].nunique()
825
FCVC, NCP, FAF and TUE are features which could have been categorical instead of numerical. Looking at the sample initial questionnaire, the answers to those questions could only be one of a few (<5) options provided in the survey. The original dataset was unbalanced (fig 1) in favor of the normal weight class, so the authors found it useful to apply the SMOTE algorithm to oversample the data (fig 2).
Fig 1: Original unbalanced distribution [1]
Fig 2: Oversampled balanced distribution [1]
At least in python, there are variants of the default SMOTE that can handle both continuous and categorical features. My guess is that the features mentioned above were encoded as numeric features during oversampling which led to the intermediate values observed.
It is unlikely that we want the intermediate values as it is impossible to get real world data with such values. This should be considered in model building.
References
[1]F. M. Palechor and A. de la H. Manotas, “Dataset for estimation of obesity levels based on eating habits and physical condition in individuals from Colombia, Peru and Mexico,” Data in Brief, vol. 25, p. 104344, Aug. 2019, doi: https://doi.org/10.1016/j.dib.2019.104344.
# Dtypes and non-null counts: every feature except Gender and WeightClass
# has missing entries.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 2111 non-null object 1 Age 1573 non-null float64 2 Height 1465 non-null float64 3 Weight 1514 non-null float64 4 FHO 1483 non-null object 5 FAVC 1565 non-null object 6 FCVC 1676 non-null float64 7 NCP 1632 non-null float64 8 CAEC 1458 non-null object 9 SMOKE 1609 non-null object 10 CH2O 1542 non-null float64 11 SCC 1475 non-null object 12 FAF 1504 non-null float64 13 TUE 1536 non-null float64 14 CALC 1610 non-null object 15 MTRANS 1502 non-null object 16 WeightClass 2111 non-null int64 dtypes: float64(8), int64(1), object(8) memory usage: 280.5+ KB
# percent of missing values per variable (isna().sum() counts NaNs per column)
df.isna().sum() * 100 / df.shape[0]
Gender 0.000000 Age 25.485552 Height 30.601611 Weight 28.280436 FHO 29.748934 FAVC 25.864519 FCVC 20.606348 NCP 22.690668 CAEC 30.933207 SMOKE 23.780199 CH2O 26.954050 SCC 30.127901 FAF 28.754145 TUE 27.238276 CALC 23.732828 MTRANS 28.848887 WeightClass 0.000000 dtype: float64
# barplot of missing values per column (missingno visual nullity summary)
msno.bar(df, color='steelblue');
MCAR: Missing completely at random. The probability of missingness is random meaning independent of data observed or missed. Example of data missing due to equipment/sensoric failures.
MAR: Missing at random. The probability of missingness depends on the observed data but not on the missing values. This means we can explain the missing values by looking at the data for which complete information exists. There is some pattern in the sub-samples for which data is missing.
MNAR: Missing not at random. The probability of missingness is also related to the unobserved (missed) values in the dataset.
While it is acceptable to ignore rows/columns with missing values in case of MCAR, doing so in case of MAR and MNAR could potentially introduce bias in the data distribution hence it is advised to work on imputing missing data.
References
[1] D. B. RUBIN, “Inference and missing data,” Biometrika, vol. 63, no. 3, pp. 581–592, 1976, doi: https://doi.org/10.1093/biomet/63.3.581.
# Keep only the columns whose nullity mask actually varies (at least one
# missing AND one present value -- variance of the boolean mask > 0), then
# correlate the masks pairwise and draw a heatmap.
nullity_variance = np.var(df.isnull(), axis='rows')
varying_cols = [i for i, v in enumerate(nullity_variance) if v > 0]
df_null = df.iloc[:, varying_cols]
corr_mat = df_null.isnull().corr()
px.imshow(corr_mat, width=700, height=500, color_continuous_scale='magenta', title='Nullity Correlation between variables')
Nullity correlation heatmap. It shows how strongly the presence or absence of one feature affects the presence of another.
# Mask to pairs with nullity correlation >= +0.05; everything below the
# threshold shows up as NaN.
corr_mat[corr_mat >= 0.05] # 5% positive threshold
| Age | Height | Weight | FHO | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Height | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Weight | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FHO | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FAVC | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FCVC | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| NCP | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CAEC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| SMOKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| CH2O | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN |
| SCC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN |
| FAF | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN |
| TUE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN |
| CALC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN |
| MTRANS | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
# Mask to pairs with nullity correlation <= -0.05 (weak "one missing, the
# other present" tendencies).
corr_mat[corr_mat <= -0.05] # 5% negative threshold
| Age | Height | Weight | FHO | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Height | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Weight | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FHO | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.05757 |
| FAVC | NaN | NaN | NaN | NaN | NaN | NaN | -0.051377 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FCVC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| NCP | NaN | NaN | NaN | NaN | -0.051377 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CAEC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| SMOKE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CH2O | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| SCC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| FAF | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| TUE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| CALC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| MTRANS | NaN | NaN | NaN | -0.05757 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Looking at the nullity correlation values between features, there is only a weak negative correlation (about −0.06) between the missingness of FHO and MTRANS, and similarly (about −0.05) between NCP and FAVC: when one of the pair is missing, the other is very slightly more likely to be present.
These magnitudes are too low to allow concluding any systematic nullity correlation.
# Hierarchically cluster the FEATURES by the similarity of their missingness
# masks: features whose values tend to be missing together merge low in the
# dendrogram.
feature_df = df.drop(['WeightClass', 'Gender'], axis=1)  # hoisted: was computed twice
# rows of x = features, columns = samples; True where the value is missing
x = np.transpose(feature_df.isnull().astype(bool).values)
# metric="correlation": 1 - Pearson correlation between nullity masks.
# (hamming = proportion of elements that disagree;
#  jaccard dissimilarity = 1 - Jaccard index)
z = hierarchy.linkage(x, method='complete', metric="correlation")
fig = plt.figure(figsize=(25, 10))
hierarchy.dendrogram(z, labels=feature_df.columns, orientation='top', distance_sort='descending');
# complete linkage merges at the MAXIMUM pairwise dissimilarity, so label the
# axis accordingly (the former 'Average distance' label was misleading)
plt.ylabel('Complete-linkage (maximum) distance')
plt.xlabel('Clusters')
plt.show()
Looking at the correlation matrix, I couldn't find any pairwise nullity similarity between features so in the figure above, I tried to check for deeper nullity correlation between the variables.
Hierarchical clustering is used to bin features against one another with regard to their nullity correlation (two features with the same nullity bits are closer than two features with different nullity bits).
At each step, the agglomerative clustering algorithm merges the closest clusters of features. If a set of variables had identical nullity masks, their total dissimilarity would be close to zero, so their merge height on the y axis would also be close to zero.
From the dendrogram, we can see that SMOKE and CAEC have similar nullity patterns, as do the set of SCC, FCVC, Age, FAVC and the set of MTRANS, Height, CALC. While the features are grouped into clusters, the co-absence dissimilarity is too high to conclude on any co-nullity pattern among variables or groups of variables.
At this step, we do not have evidence to point out to a specific type of missingness.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# number of folds used by every cross-validation below
N_SPLITS = 5


def score_fn_cv(clf, df, target="WeightClass"):
    """Cross-validate `clf` on `df` and return (mean, std) of the micro-F1.

    Stratified K-fold preserves the class proportions of `target` per fold.
    """
    folds = StratifiedKFold(n_splits=N_SPLITS)
    X = df.drop(target, axis=1)
    y = df[target]
    fold_scores = cross_val_score(clf, X, y, cv=folds, scoring='f1_micro')
    return fold_scores.mean(), fold_scores.std()
# Feature lists consumed by the preprocessing ColumnTransformer below.
NUM_FEATURES = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
# Unlike `categoric_vars`, this deliberately excludes the target WeightClass.
CAT_FEATURES = ["Gender", "FHO", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"]
FEATURES = NUM_FEATURES + CAT_FEATURES
def end2end_pipeline(num_imputer, cat_imputer, num_features=NUM_FEATURES, cat_features=CAT_FEATURES):
    """Build a (classifier pipeline, preprocessor) pair around the given imputers.

    Numeric columns are min-max scaled and then imputed; categorical columns
    are ordinal-encoded (unknown categories mapped to NaN), imputed, and
    one-hot encoded. The final estimator is a LogisticRegression.
    Step names are kept stable so grid-search parameter paths keep working.
    """
    numerical_transformer = Pipeline(
        steps=[
            ('encoder', MinMaxScaler()),
            ("imputer", num_imputer),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("ordinal_encode", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
            ("imputer", cat_imputer),
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, num_features),
            ('cat', categorical_transformer, cat_features),
        ]
    )
    clf = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LogisticRegression(max_iter=2000)),
        ]
    )
    return clf, preprocessor
scores = {} # to store mean of f1 scores
stds = {} # to store std of f1 scores
# display the raw frame before running any imputation experiment
df
| Gender | Age | Height | Weight | FHO | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | WeightClass | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 23.501249 | 1.600000 | NaN | NaN | no | NaN | 3.0 | NaN | no | 2.074048 | no | NaN | NaN | no | Public_Transportation | 1 |
| 1 | Male | 25.000000 | 1.790000 | 72.000000 | NaN | NaN | 2.000000 | 3.0 | Sometimes | no | 2.000000 | no | 1.000000 | NaN | Sometimes | Public_Transportation | 2 |
| 2 | Male | 18.274358 | 1.824655 | 58.621349 | NaN | yes | 2.140840 | 4.0 | Sometimes | no | 2.931438 | NaN | 2.000000 | 1.164457 | no | Automobile | 1 |
| 3 | Female | 26.000000 | 1.643892 | 111.884535 | yes | yes | 3.000000 | NaN | Sometimes | no | 2.768141 | no | NaN | NaN | Sometimes | Public_Transportation | 4 |
| 4 | Male | NaN | 1.850000 | 115.000000 | no | NaN | NaN | NaN | Sometimes | NaN | 3.000000 | yes | 1.000000 | NaN | no | NaN | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | Male | 26.015448 | 1.829907 | 105.436173 | NaN | yes | 3.000000 | 3.0 | NaN | no | 2.224914 | no | 1.781589 | NaN | NaN | Public_Transportation | 4 |
| 2107 | Male | 26.774115 | 1.755938 | 112.287678 | NaN | yes | 1.428289 | NaN | Sometimes | no | NaN | no | 0.485322 | NaN | Sometimes | Automobile | 4 |
| 2108 | Male | 23.000000 | NaN | 66.000000 | no | no | 3.000000 | 3.0 | NaN | no | 2.000000 | no | 3.000000 | 0.000000 | NaN | Public_Transportation | 2 |
| 2109 | Female | 23.803904 | 1.581527 | 78.089575 | yes | yes | 2.000000 | 1.0 | Sometimes | no | 2.000000 | no | NaN | 0.000000 | no | Public_Transportation | 4 |
| 2110 | Male | NaN | 1.766975 | 118.363376 | yes | NaN | 2.964319 | 3.0 | Sometimes | no | NaN | no | NaN | 1.875023 | Sometimes | Automobile | 4 |
2111 rows × 17 columns
# Nearly every row contains at least one NaN, so listwise deletion is not viable.
print(f"Percent of rows with at least one missing value is {round(df.isnull().any(axis = 1).sum() / df.shape[0] * 100, 2)}%")
Percent of rows with at least one missing value is 98.96%
This makes it impractical to drop rows containing missing values, as we would not have enough data left for training.
# Baseline: mean imputation for numeric features, mode for categorical ones.
label = "Simple Imputation Mean/Mod"
clf, preprocessor = end2end_pipeline(
    num_imputer=SimpleImputer(strategy="mean", missing_values=np.nan),
    cat_imputer=SimpleImputer(strategy="most_frequent", missing_values=np.nan),
)
mean_f1, std_f1 = score_fn_cv(df=df.copy(), clf=clf, target="WeightClass")
scores[label] = mean_f1
stds[label] = std_f1
print(f"Simple Imputation Mean/Mod: F1 score = {100*scores['Simple Imputation Mean/Mod']:.2f}%")
clf
Simple Imputation Mean/Mod: F1 score = 68.45%
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
SimpleImputer())]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
SimpleImputer())]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer', SimpleImputer())]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
SimpleImputer()
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(max_iter=2000)
# Peek at the fully transformed design matrix (scaled numerics + one-hot categoricals).
pd.DataFrame(preprocessor.fit_transform(df.drop("WeightClass", axis=1)), columns=preprocessor.get_feature_names_out()).sample(20)
| num__Age | num__Height | num__Weight | num__FCVC | num__NCP | num__CH2O | num__FAF | num__TUE | cat__Gender_1.0 | cat__FHO_1.0 | ... | cat__CAEC_3.0 | cat__SMOKE_1.0 | cat__SCC_1.0 | cat__CALC_1.0 | cat__CALC_2.0 | cat__CALC_3.0 | cat__MTRANS_1.0 | cat__MTRANS_2.0 | cat__MTRANS_3.0 | cat__MTRANS_4.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1132 | 0.249240 | 0.421419 | 0.380597 | 0.595055 | 0.343286 | 0.503891 | 0.648969 | 0.000000 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1698 | 0.249240 | 0.415094 | 0.082090 | 0.708950 | 0.666667 | 1.000000 | 0.666667 | 0.500000 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1983 | 0.100584 | 0.697600 | 0.353481 | 1.000000 | 0.666667 | 0.577963 | 0.666612 | 0.325494 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1339 | 0.243085 | 0.475274 | 0.353481 | 1.000000 | 0.666667 | 0.503891 | 0.333333 | 0.000000 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1727 | 0.596133 | 0.595917 | 0.593532 | 0.588948 | 0.662551 | 0.372548 | 0.218849 | 0.000000 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 622 | 0.249240 | 0.475274 | 0.092853 | 0.500000 | 0.666667 | 0.114958 | 0.206511 | 1.000000 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2107 | 0.310520 | 0.577242 | 0.546923 | 0.214144 | 0.563780 | 0.503891 | 0.161774 | 0.325494 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1781 | 0.249240 | 0.566038 | 0.573186 | 0.708950 | 0.563780 | 0.071322 | 0.171408 | 0.325494 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1703 | 0.218776 | 0.495389 | 0.272245 | 1.000000 | 0.884687 | 0.114958 | 0.264496 | 0.798804 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1255 | 0.173357 | 0.546145 | 0.831176 | 1.000000 | 0.666667 | 0.503891 | 0.527558 | 0.340373 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 706 | 0.249240 | 0.471698 | 0.353481 | 0.500000 | 0.563780 | 0.500000 | 0.000000 | 0.500000 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 125 | 0.144650 | 0.475274 | 0.022388 | 0.862641 | 0.086601 | 0.000000 | 0.458223 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 767 | 0.097234 | 0.625553 | 0.088580 | 0.708950 | 0.666667 | 0.500000 | 0.173469 | 0.325494 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1429 | 0.218485 | 0.675981 | 0.353481 | 0.481982 | 0.278742 | 0.427685 | 0.340235 | 0.833873 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 405 | 0.249240 | 0.578319 | 0.584841 | 0.225669 | 0.563780 | 0.500000 | 0.461536 | 0.325494 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 824 | 0.280198 | 0.629021 | 0.353481 | 0.595714 | 0.563780 | 0.551354 | 0.441780 | 0.190490 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1790 | 0.097234 | 0.475274 | 0.353481 | 0.500000 | 0.380613 | 0.969923 | 0.340235 | 0.051528 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1824 | 0.249240 | 0.165604 | 0.022388 | 0.708950 | 0.000000 | 0.503891 | 0.000000 | 0.233085 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1311 | 0.383858 | 0.477957 | 0.474568 | 0.827396 | 0.563780 | 0.172704 | 0.109139 | 0.193431 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1211 | 0.579343 | 0.303523 | 0.353481 | 0.518307 | 0.272327 | 0.503891 | 0.340235 | 0.000000 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
20 rows × 23 columns
# Same mean/mode imputation, but add_indicator=True appends a binary
# missingness flag per imputed column, letting the model use WHERE values
# were missing as a signal.
clf, preprocessor = end2end_pipeline(
    num_imputer=SimpleImputer(strategy="mean", add_indicator=True),
    cat_imputer=SimpleImputer(strategy="most_frequent", add_indicator=True)
)
mean_f1, std_f1 = score_fn_cv(df=df.copy(), clf=clf, target="WeightClass")
scores["Simple Imputation Mean/Mod with missingness"] = mean_f1
stds["Simple Imputation Mean/Mod with missingness"] = std_f1
# fix: the printed label previously said "Simple Imputation Mean/Mod",
# which mislabeled this experiment as the plain baseline
print(f"Simple Imputation Mean/Mod with missingness: F1 score = {100*scores['Simple Imputation Mean/Mod with missingness']:.2f}%")
clf
Simple Imputation Mean/Mod: F1 score = 70.16%
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
SimpleImputer(add_indicator=True))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(add_indicator=True,
strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
SimpleImputer(add_indicator=True))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(add_indicator=True,
strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
SimpleImputer(add_indicator=True))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
SimpleImputer(add_indicator=True,
strategy='most_frequent')),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
SimpleImputer(add_indicator=True)
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
SimpleImputer(add_indicator=True, strategy='most_frequent')
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(max_iter=2000)
# Transformed matrix now includes the missingindicator_* flag columns appended by the imputers.
pd.DataFrame(preprocessor.fit_transform(df.drop("WeightClass", axis=1)), columns=preprocessor.get_feature_names_out()).sample(20)
| num__Age | num__Height | num__Weight | num__FCVC | num__NCP | num__CH2O | num__FAF | num__TUE | num__missingindicator_Age | num__missingindicator_Height | ... | cat__MTRANS_2.0 | cat__MTRANS_3.0 | cat__MTRANS_4.0 | cat__missingindicator_FHO_1.0 | cat__missingindicator_FAVC_1.0 | cat__missingindicator_CAEC_1.0 | cat__missingindicator_SMOKE_1.0 | cat__missingindicator_SCC_1.0 | cat__missingindicator_CALC_1.0 | cat__missingindicator_MTRANS_1.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 432 | 0.137050 | 0.677549 | 0.343284 | 1.000000 | 0.563780 | 0.614586 | 0.535984 | 0.314030 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1419 | 0.170159 | 0.566038 | 0.365672 | 0.500000 | 0.666667 | 0.503891 | 0.340235 | 0.325494 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1477 | 0.194468 | 0.547170 | 0.268657 | 1.000000 | 0.666667 | 0.000000 | 0.340235 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2021 | 0.249240 | 0.475274 | 0.364204 | 0.877323 | 0.666667 | 0.978911 | 0.747585 | 0.325494 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 740 | 0.179673 | 0.529502 | 0.693303 | 1.000000 | 0.666667 | 0.503891 | 0.566963 | 0.325494 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1744 | 0.316010 | 0.471698 | 0.353481 | 0.708950 | 0.666667 | 1.000000 | 0.333333 | 0.325494 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 45 | 0.194468 | 0.245283 | 0.141791 | 0.500000 | 0.000000 | 0.503891 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 665 | 0.429356 | 0.752764 | 0.353481 | 0.699766 | 0.541026 | 0.767564 | 0.340235 | 0.303132 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1085 | 0.170582 | 0.570549 | 0.299326 | 0.708950 | 0.000000 | 0.500000 | 0.857410 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1957 | 0.434367 | 0.566038 | 0.590320 | 0.569598 | 0.666667 | 0.524215 | 0.340235 | 0.180493 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 1447 | 0.249240 | 0.068838 | 0.353481 | 0.500000 | 0.034881 | 0.503891 | 0.000000 | 0.484548 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 257 | 0.249240 | 0.475274 | 0.216418 | 0.708950 | 1.000000 | 0.503891 | 0.340235 | 0.500000 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1158 | 0.249240 | 0.471698 | 0.291353 | 0.708950 | 0.666667 | 0.582802 | 0.285324 | 0.419830 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 92 | 0.249240 | 0.333170 | 0.492815 | 1.000000 | 0.666667 | 0.503891 | 0.002671 | 0.251948 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 998 | 0.090669 | 0.475274 | 0.091168 | 0.525642 | 0.548619 | 0.678760 | 0.340235 | 0.448971 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 294 | 0.218776 | 0.660377 | 0.283582 | 0.500000 | 0.666667 | 0.500000 | 0.666667 | 0.325494 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1183 | 0.246708 | 0.707360 | 0.610490 | 0.612575 | 0.563780 | 0.678989 | 0.647914 | 0.341064 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 537 | 0.291702 | 0.207547 | 0.470149 | 1.000000 | 0.666667 | 0.000000 | 0.000000 | 0.500000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1742 | 0.132036 | 0.475274 | 0.353481 | 0.921728 | 0.563780 | 0.503891 | 0.340235 | 0.325494 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 350 | 0.123763 | 0.475274 | 0.353481 | 1.000000 | 0.666667 | 0.503891 | 0.340235 | 0.000000 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
20 rows × 38 columns
### KNN Imputation
# KNN-based imputation: k=15 neighbours for the numerical columns, k=1 for the
# ordinal-encoded categoricals so that category codes are never averaged.
cat_pipe = Pipeline(
    steps=[("imputer", KNNImputer(n_neighbors=1, add_indicator=True))]
)
clf, preprocessor = end2end_pipeline(
    num_imputer=KNNImputer(n_neighbors=15, add_indicator=True),
    cat_imputer=cat_pipe,
)
# Cross-validated F1 of the full pipeline; record mean and std for comparison.
mean_f1, std_f1 = score_fn_cv(df=df.copy(), clf=clf, target="WeightClass")
scores["KNN Imputation"], stds["KNN Imputation"] = mean_f1, std_f1
print(f"KNN Imputation: F1 score = {100 * mean_f1:.2f}%")
clf
KNN Imputation: F1 score = 70.96%
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
Pipeline(steps=[('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
Pipeline(steps=[('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
Pipeline(steps=[('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
KNNImputer(add_indicator=True, n_neighbors=15)
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
Pipeline(steps=[('imputer', KNNImputer(add_indicator=True, n_neighbors=1))])KNNImputer(add_indicator=True, n_neighbors=1)
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(max_iter=2000)
# Inspect the transformed feature matrix (scaled/imputed values plus missing-indicator columns)
pd.DataFrame(preprocessor.fit_transform(df.drop("WeightClass", axis=1)), columns=preprocessor.get_feature_names_out()).sample(20)
| num__Age | num__Height | num__Weight | num__FCVC | num__NCP | num__CH2O | num__FAF | num__TUE | num__missingindicator_Age | num__missingindicator_Height | ... | cat__MTRANS_2.0 | cat__MTRANS_3.0 | cat__MTRANS_4.0 | cat__missingindicator_FHO_1.0 | cat__missingindicator_FAVC_1.0 | cat__missingindicator_CAEC_1.0 | cat__missingindicator_SMOKE_1.0 | cat__missingindicator_SCC_1.0 | cat__missingindicator_CALC_1.0 | cat__missingindicator_MTRANS_1.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 411 | 0.108673 | 0.766804 | 0.146821 | 0.671110 | 0.979033 | 0.655895 | 0.671126 | 0.564177 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 483 | 0.284835 | 0.427875 | 0.386059 | 0.756012 | 0.510493 | 0.139722 | 0.037477 | 0.567751 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 409 | 0.479423 | 0.496047 | 0.479672 | 0.707583 | 0.535487 | 0.338062 | 0.556787 | 0.107292 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1721 | 0.139820 | 0.427068 | 0.075411 | 0.860351 | 0.666667 | 0.500000 | 0.620745 | 0.500000 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1569 | 0.227535 | 0.303108 | 0.460017 | 0.998762 | 0.051439 | 0.119028 | 0.376433 | 0.306181 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| 43 | 0.237222 | 0.441622 | 0.344748 | 0.729988 | 0.769487 | 0.480228 | 0.593215 | 0.409517 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1169 | 0.248813 | 0.384178 | 0.471154 | 1.000000 | 0.666667 | 0.529619 | 0.006687 | 0.347919 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 270 | 0.349829 | 0.679358 | 0.360349 | 0.716593 | 0.666667 | 0.102816 | 0.666667 | 0.195910 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 827 | 0.145851 | 0.377358 | 0.141791 | 0.500000 | 0.666667 | 0.500000 | 1.000000 | 0.500000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 2070 | 0.232524 | 0.639892 | 0.417572 | 1.000000 | 0.666667 | 0.916885 | 0.268807 | 0.337561 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 525 | 0.260571 | 0.435766 | 0.380597 | 1.000000 | 0.562927 | 0.000000 | 0.412856 | 0.000000 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1578 | 0.099837 | 0.515592 | 0.351780 | 0.449897 | 0.666667 | 0.219981 | 0.317633 | 0.419424 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 111 | 0.097234 | 0.377120 | 0.186567 | 1.000000 | 0.666667 | 0.513930 | 0.000000 | 0.500000 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1907 | 0.170159 | 0.571592 | 0.290788 | 0.919524 | 0.368670 | 0.319601 | 0.472982 | 0.483959 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1165 | 0.151198 | 0.677832 | 0.149022 | 0.762214 | 1.000000 | 0.500000 | 0.666667 | 0.019190 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 1510 | 0.275592 | 0.501898 | 0.339190 | 0.564834 | 0.564549 | 0.803345 | 0.292096 | 0.145449 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1423 | 0.325755 | 0.429383 | 0.268657 | 0.500000 | 0.666667 | 0.319165 | 0.111526 | 0.304506 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 |
| 374 | 0.272001 | 0.622642 | 0.253731 | 0.638601 | 0.666667 | 0.500000 | 1.000000 | 0.500000 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 285 | 0.228120 | 0.519975 | 0.324479 | 0.856373 | 0.617892 | 0.263157 | 0.333333 | 0.086833 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 194 | 0.180618 | 0.476791 | 0.097015 | 1.000000 | 0.600000 | 0.500000 | 0.666667 | 0.370921 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
20 rows × 38 columns
def get_estimator_name(estimator):
    """Return the bare class name of *estimator* (e.g. 'BayesianRidge').

    Uses ``type(estimator).__name__`` instead of parsing the string form of
    the type, which was fragile (it produced garbage such as "<class 'lis"
    for builtins whose qualified name contains no dot).
    """
    return type(estimator).__name__
# Compare IterativeImputer with three different regressors as the per-column
# estimator. Numericals are initialised with the mean, categoricals (already
# ordinal-encoded by the pipeline) with the most frequent value.
estimators = [
    BayesianRidge(),
    RandomForestRegressor(),
    KNeighborsRegressor(),
]
for estimator in estimators:
    label = f"Iterative Imputation with {get_estimator_name(estimator)}"
    logging.info(label)
    cat_pipe = Pipeline(
        steps=[
            (
                "imputer",
                IterativeImputer(
                    add_indicator=True,
                    estimator=estimator,
                    random_state=42,
                    n_nearest_features=1,
                    sample_posterior=False,
                    initial_strategy="most_frequent",
                    max_iter=20,
                ),
            )
        ]
    )
    clf, pre = end2end_pipeline(
        num_imputer=IterativeImputer(
            add_indicator=True,
            random_state=42,
            n_nearest_features=1,
            sample_posterior=False,
            initial_strategy="mean",
            estimator=estimator,
            max_iter=20,
        ),
        cat_imputer=cat_pipe,
    )
    mean_f1, std_f1 = score_fn_cv(df=df.copy(), clf=clf, target="WeightClass")
    scores[label] = mean_f1
    stds[label] = std_f1
    logging.info(f"{label}: F1 score = {100 * mean_f1:.2f}%")
clf
2024-04-23T15:43:08+0300 INFO: Iterative Imputation with BayesianRidge 2024-04-23T15:43:13+0300 INFO: Iterative Imputation with BayesianRidge: F1 score = 67.64% 2024-04-23T15:43:13+0300 INFO: Iterative Imputation with RandomForestRegressor 2024-04-23T15:51:40+0300 INFO: Iterative Imputation with RandomForestRegressor: F1 score = 66.08% 2024-04-23T15:51:40+0300 INFO: Iterative Imputation with KNeighborsRegressor 2024-04-23T15:51:56+0300 INFO: Iterative Imputation with KNeighborsRegressor: F1 score = 67.88%
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
max_iter=20,
n_nearest_features=1,
random_state=42))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
Ordi...
Pipeline(steps=[('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
initial_strategy='most_frequent',
max_iter=20,
n_nearest_features=1,
random_state=42))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
max_iter=20,
n_nearest_features=1,
random_state=42))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
Ordi...
Pipeline(steps=[('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
initial_strategy='most_frequent',
max_iter=20,
n_nearest_features=1,
random_state=42))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier', LogisticRegression(max_iter=2000))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
max_iter=20,
n_nearest_features=1,
random_state=42))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_e..._value',
unknown_value=nan)),
('imputer',
Pipeline(steps=[('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
initial_strategy='most_frequent',
max_iter=20,
n_nearest_features=1,
random_state=42))])),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
IterativeImputer(add_indicator=True, estimator=KNeighborsRegressor(),
max_iter=20, n_nearest_features=1, random_state=42)KNeighborsRegressor()
KNeighborsRegressor()
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
Pipeline(steps=[('imputer',
IterativeImputer(add_indicator=True,
estimator=KNeighborsRegressor(),
initial_strategy='most_frequent', max_iter=20,
n_nearest_features=1, random_state=42))])IterativeImputer(add_indicator=True, estimator=KNeighborsRegressor(),
initial_strategy='most_frequent', max_iter=20,
n_nearest_features=1, random_state=42)KNeighborsRegressor()
KNeighborsRegressor()
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(max_iter=2000)
# Inspect the matrix produced by the (last fitted) preprocessor on the full data
pd.DataFrame(preprocessor.fit_transform(df.drop("WeightClass", axis=1)), columns=preprocessor.get_feature_names_out()).sample(20)
| num__Age | num__Height | num__Weight | num__FCVC | num__NCP | num__CH2O | num__FAF | num__TUE | num__missingindicator_Age | num__missingindicator_Height | ... | cat__MTRANS_2.0 | cat__MTRANS_3.0 | cat__MTRANS_4.0 | cat__missingindicator_FHO_1.0 | cat__missingindicator_FAVC_1.0 | cat__missingindicator_CAEC_1.0 | cat__missingindicator_SMOKE_1.0 | cat__missingindicator_SCC_1.0 | cat__missingindicator_CALC_1.0 | cat__missingindicator_MTRANS_1.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1229 | 0.291366 | 0.403079 | 0.493449 | 1.000000 | 0.666667 | 0.313234 | 0.003394 | 0.206403 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 293 | 0.291702 | 0.364183 | 0.465441 | 1.000000 | 0.573141 | 0.816249 | 0.120098 | 0.070396 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 27 | 0.236711 | 0.439941 | 0.344282 | 0.501038 | 0.666667 | 0.669186 | 0.299187 | 0.040578 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1834 | 0.194468 | 0.425370 | 0.473309 | 1.000000 | 0.666667 | 0.907647 | 0.398935 | 0.500000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 838 | 0.297918 | 0.566038 | 0.604478 | 0.500000 | 0.666667 | 0.500000 | 0.666667 | 0.303967 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 788 | 0.542330 | 0.474334 | 0.328358 | 0.686735 | 0.377448 | 0.500000 | 0.490724 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1662 | 0.656164 | 0.222181 | 0.302191 | 0.500000 | 0.666667 | 0.976253 | 0.137641 | 0.290355 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 54 | 0.115976 | 0.450305 | 0.707464 | 1.000000 | 0.666667 | 0.934617 | 0.488644 | 0.375374 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 |
| 1612 | 0.166479 | 0.521898 | 0.650747 | 1.000000 | 0.666667 | 0.379676 | 0.535262 | 0.475219 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1942 | 0.218776 | 0.345782 | 0.286970 | 0.500000 | 0.440275 | 0.328627 | 0.008714 | 0.000000 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 241 | 0.643356 | 0.205670 | 0.342095 | 0.500000 | 0.666667 | 0.286185 | 0.174306 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 180 | 0.139144 | 0.537419 | 0.082090 | 0.732316 | 0.597775 | 0.349666 | 0.434659 | 0.399250 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1655 | 0.262093 | 0.431722 | 0.266258 | 1.000000 | 0.579223 | 0.500000 | 0.000000 | 0.000000 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1441 | 0.145851 | 0.245283 | 0.243745 | 0.500000 | 1.000000 | 0.546642 | 0.333333 | 0.500000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 786 | 0.214271 | 0.369636 | 0.273794 | 0.500000 | 0.511117 | 0.500000 | 0.030781 | 0.324490 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 68 | 0.097234 | 0.425965 | 0.242728 | 0.500000 | 0.666667 | 0.500000 | 0.333333 | 0.000000 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 282 | 0.291702 | 0.390010 | 0.522904 | 1.000000 | 0.666667 | 0.764757 | 0.000000 | 0.069281 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 153 | 0.167240 | 0.454202 | 0.208605 | 0.106454 | 0.753311 | 0.014852 | 0.666667 | 0.500000 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 716 | 0.197882 | 0.484150 | 0.907757 | 1.000000 | 0.650810 | 0.702025 | 0.336655 | 0.333021 | 1.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1620 | 0.170159 | 0.443420 | 0.268657 | 0.451826 | 0.666667 | 0.000000 | 0.330340 | 0.000000 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
20 rows × 38 columns
# Cross-validated mean F1 per imputation strategy
scores
{'Simple Imputation Mean/Mod': 0.6845058429408535,
'Simple Imputation Mean/Mod with missingness': 0.7015540093890401,
'KNN Imputation': 0.7096075201953995,
'Iterative Imputation with BayesianRidge': 0.6764467300819021,
'Iterative Imputation with RandomForestRegressor': 0.6608203645815827,
'Iterative Imputation with KNeighborsRegressor': 0.6788264820230132}
# Corresponding cross-validated F1 standard deviations
stds
{'Simple Imputation Mean/Mod': 0.0077036062356310425,
'Simple Imputation Mean/Mod with missingness': 0.012472904162187038,
'KNN Imputation': 0.013628848459331648,
'Iterative Imputation with BayesianRidge': 0.017169157550399414,
'Iterative Imputation with RandomForestRegressor': 0.012849014958810835,
'Iterative Imputation with KNeighborsRegressor': 0.006213508848002439}
# Horizontal bar chart comparing all imputation strategies (values in percent,
# error bars = cross-validation std).
scores_df = pd.DataFrame()
scores_df["means"] = list(scores.values())
scores_df["stds"] = list(stds.values())
scores_df *= 100  # convert fractions to percentages
scores_df.index = list(scores.keys())
scores_df[["means"]].plot.barh(xerr=scores_df["stds"], color='steelblue');
plt.title("Comparison of imputation methods on F1 score");  # fixed typo: "Comparision"
plt.xlabel("F1 Score, the higher the better");
plt.xlim(0, 100);
# Annotate each bar with its value: worst in red, best in green, rest black.
for i, v in enumerate(scores_df["means"]):
    if v == scores_df["means"].min():
        color = "red"
    elif v == scores_df["means"].max():
        color = "green"
    else:
        color = "black"
    plt.text(v+1, i, " "+str(round(v,2)) + "%", color=color, va='center');
Summary
Simple Imputation Mean/Mod: Missing numerical features are replaced by the mean while missing categorical features are replaced by the mode.
Simple Imputation with Missingness: In addition to replacing with the mean/mode, an extra indicator feature is added to flag the missingness of each feature.
KNN Imputation: Nearest neighbors imputation. For numerical features, n_neighbors of 15 is used. For categorical features, nearest neighbors is preceded by ordinal encoding which is a requirement of the sklearn KNNImputer. In case of categorical features, the n_neighbors parameter is set to 1 to avoid averaging categorical values. Another alternative would be to use a custom distance metric that for 2 1-D arrays returns a distance proportional to the frequency of the mode of the categories. With such metric, we can safely try out n_neighbors>1
Iterative Imputation: Numerical and categorical features are treated differently. A similar setting to the KNN imputation is used. The strategy to initialize numerical features is set to the mean, and to the mode for categorical features.
KNN Imputation achieves the highest F1 score among the imputation methods. Iterative imputation, on the other hand, has the lowest F1 score.
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV, KFold
We will reuse the K-Nearest Neighbors imputation pipeline introduced in KNN Imputation to build on, as it yielded the highest performance.
# Pie chart of the target class distribution among respondents
plt.pie(df['WeightClass'].value_counts(), labels=df["WeightClass"].value_counts().index,autopct='%.0f%%', colors=["skyblue", "steelblue", "lightblue", "blue"])
plt.title('Weight Class Distribution among respondents');
Obese respondents are 46%, followed by overweight 27%, normal weight 14% and underweight 13%.
# Helper: grouped count plot of a categorical feature against the target.
def bar_plot(feature, df=df, target="WeightClass"):
    """Show a countplot of *target* broken down by *feature*."""
    sns.countplot(x=target, hue=feature, data=df, palette="Blues_r")
    plt.title(f"{feature} vs {target}")
    plt.show()
bar_plot("Gender")  # weight class distribution split by gender
2024-04-23T15:54:38+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:38+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
# Interactive bar chart of respondent counts per gender
fig = px.bar(df["Gender"].value_counts(), labels={'index': 'Gender', 'value': 'Count'}, color=["Male", "Female"], color_discrete_sequence=["steelblue","skyblue"], title='Bar Plot of Gender')
fig.show()
Men tend to be slightly more obese than women, while women tend to be more underweight compared to men. There are almost equal numbers of male and female respondents.
bar_plot("FHO")  # family history of obesity vs weight class
2024-04-23T15:54:38+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:38+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
The more obesity history there is in the family, the more likely it is for respondent to be obese.
bar_plot("FAVC")  # frequent high-caloric food consumption vs weight class
2024-04-23T15:54:39+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:39+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Every weight class consumes high caloric foods but the obese and overweight people tend to consume more than normal and under weights.
bar_plot("CAEC")  # eating between meals vs weight class
2024-04-23T15:54:39+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:39+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
First, we can see that every weight class has food between meals. People who sometimes have food between meals are the ones most likely to be obese. Meanwhile, people who always have food between meals tend to be of normal weight. It would be interesting to simplify the feature into a binary yes or no.
# Collapse CAEC into a binary yes/no "eats between meals" indicator.
# NOTE(review): missing CAEC values also map to 'yes' (NaN != 'no') — confirm intended.
tmp = pd.DataFrame()
tmp["CAEC"] = df["CAEC"].apply(lambda x: 'no' if x == 'no' else 'yes')
tmp["WeightClass"] = df["WeightClass"]
bar_plot("CAEC", tmp)
2024-04-23T15:54:41+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:41+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
With the simplified feature, we clearly see that the obese consume more food between meals, while the normal and underweight classes show almost equal levels of food consumption between meals.
bar_plot("SMOKE")  # smoking habit vs weight class
2024-04-23T15:54:42+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:42+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
# Overall smoking prevalence, regardless of weight class
df["SMOKE"].value_counts().plot(kind="bar");
plt.title("Smoke habit in respondents");
plt.ylabel("Count");
Very small number of people smoke. And smokers can belong to the normal weight class as much as they can be obese.
bar_plot("SCC")  # calorie monitoring vs weight class
2024-04-23T15:54:43+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:43+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
Respondents who monitor their calorie intake are more likely to be in the less dangerous weight classes. More people do not monitor their calorie intake than do.
bar_plot("CALC")  # alcohol consumption vs weight class
2024-04-23T15:54:44+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:44+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
People who drink alcohol sometimes are the dominant group among respondents and could face more weight issues than others. However, there are obese people who don't drink alcohol at all — all this to say that not drinking alcohol is no guarantee against obesity.
bar_plot("MTRANS")  # transportation mode vs weight class
2024-04-23T15:54:44+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:44+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
# Derive a binary activity flag from the transport mode (Walking/Bike = active).
# NOTE(review): missing MTRANS values map to 'no' — confirm intended.
tmp = pd.DataFrame()
tmp["ACTIVE"] = df["MTRANS"].apply(lambda x: 'yes' if x in ['Walking', 'Bike'] else 'no')
tmp["WeightClass"] = df["WeightClass"]
bar_plot("ACTIVE", tmp)
2024-04-23T15:54:45+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-04-23T15:54:45+0300 INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
There doesn't seem to be any relation between transportation means and weight class. People who use public transport, an automobile, or a motorbike can fall into the overweight and obese classes alike.
def hist(feature, nbins=7, df=df, target="WeightClass"):
    """Grouped histogram of *feature*, one colour per *target* class.

    Counts the respondents per (feature, target) pair and plots the counts
    with a marginal box plot. The aggregated column is renamed to 'Count'
    to avoid clashing with the target column restored by reset_index().
    """
    feature_group = df.groupby([feature, target]).agg({target: 'count'})
    feature_group = feature_group.rename(columns={target: 'Count'})
    feature_group = feature_group.reset_index().sort_values(by=target, ascending=True)
    # Bug fix: plot the aggregated 'Count' column. Previously y=target made
    # px.histogram sum the class labels themselves, so the computed counts
    # were never used.
    fig = px.histogram(feature_group, x=feature, y='Count', color=target, barmode='group', marginal='box', nbins=nbins, title=f'{target} = F({feature})')
    return fig.show()
# Distributions of the continuous anthropometric features per weight class
hist("Age", 6)
hist("Height", 4)
hist("Weight", 4)
NUM_FEATURES  # numerical feature columns used by the pipelines
['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Marginal distributions of all numerical features
df[NUM_FEATURES].hist(figsize=(20,20), color='skyblue')
plt.show()
# Histogram of the target column itself
df[["WeightClass"]].hist()
array([[<AxesSubplot:title={'center':'WeightClass'}>]], dtype=object)
# Correlation heatmap (df.corr() uses the numeric columns only)
px.imshow(df.corr(), width=700, height=500,color_continuous_scale='blues', title='Correlation between variables')
CAT_FEATURES  # categorical feature columns used by the pipelines
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
def end2end_preprocessing(num_imputer, cat_imputer, num_features=NUM_FEATURES, cat_features=CAT_FEATURES):
    """Build the preprocessing ColumnTransformer.

    Numerical columns: min-max scaling followed by the given imputer.
    Categorical columns: ordinal encoding (unknowns become NaN so the imputer
    can fill them), the given imputer, then one-hot encoding with the first
    level dropped.
    """
    num_pipe = Pipeline(steps=[
        ('encoder', MinMaxScaler()),
        ("imputer", num_imputer),
    ])
    cat_pipe = Pipeline(steps=[
        ("ordinal_encode", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
        ("imputer", cat_imputer),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ])
    return ColumnTransformer(transformers=[
        ('num', num_pipe, num_features),
        ('cat', cat_pipe, cat_features),
    ])
# Rebuild the preprocessor with the winning KNN-imputation settings
del preprocessor
preprocessor = end2end_preprocessing(
    num_imputer=KNNImputer(n_neighbors=15, add_indicator=True),
    cat_imputer=KNNImputer(n_neighbors=1, add_indicator=True)
)
# Stratified 80/20 hold-out split on the target
X_train, X_test, y_train, y_test = train_test_split(df.drop("WeightClass", axis=1), df["WeightClass"], stratify=df["WeightClass"], test_size=0.2, random_state=42)
preprocessor
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
KNNImputer(add_indicator=True, n_neighbors=15)
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
KNNImputer(add_indicator=True, n_neighbors=1)
OneHotEncoder(drop='first', handle_unknown='ignore')
# Full model pipeline to be tuned: preprocessing + random forest classifier
rfe = Pipeline(
    [
        ("preprocess", preprocessor),
        ("classifier", RandomForestClassifier(n_jobs=4, random_state=42)),
    ]
)
rfe
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier',
RandomForestClassifier(n_jobs=4, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier',
RandomForestClassifier(n_jobs=4, random_state=42))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
KNNImputer(add_indicator=True, n_neighbors=15)
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
KNNImputer(add_indicator=True, n_neighbors=1)
OneHotEncoder(drop='first', handle_unknown='ignore')
RandomForestClassifier(n_jobs=4, random_state=42)
# Hyperparameter grid for the forest step; keys carry the "classifier__"
# prefix so GridSearchCV can route them through the pipeline.
param_grids = {
    "classifier__n_estimators": [10, 20, 50, 100],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
    "classifier__max_depth": [3, 9, 15, 20],
}
# Stratified folds preserve the class proportions of the (imbalanced)
# target in every split; shuffle + fixed seed keeps the CV reproducible.
cv = StratifiedKFold(n_splits=N_SPLITS, random_state=42, shuffle=True)
grid_search = GridSearchCV(
    estimator=rfe,
    param_grid=param_grids,
    return_train_score=True,  # keep train scores to inspect over-fitting below
    cv=cv,
    n_jobs=4,  # run the 48-candidate x N_SPLITS-fold search in parallel
).fit(X_train, y_train)
result = pd.DataFrame(grid_search.cv_results_)
result
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_classifier__criterion | param_classifier__max_depth | param_classifier__n_estimators | params | split0_test_score | split1_test_score | ... | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.485035 | 0.009598 | 0.147497 | 0.023930 | gini | 3 | 10 | {'classifier__criterion': 'gini', 'classifier_... | 0.573964 | 0.642012 | ... | 0.602492 | 0.023414 | 47 | 0.622963 | 0.660741 | 0.614815 | 0.625463 | 0.606218 | 0.626040 | 0.018617 |
| 1 | 0.526904 | 0.020175 | 0.139894 | 0.013999 | gini | 3 | 20 | {'classifier__criterion': 'gini', 'classifier_... | 0.571006 | 0.606509 | ... | 0.601326 | 0.019867 | 48 | 0.626667 | 0.633333 | 0.615556 | 0.628423 | 0.612139 | 0.623224 | 0.008034 |
| 2 | 0.645878 | 0.035050 | 0.173447 | 0.018895 | gini | 3 | 50 | {'classifier__criterion': 'gini', 'classifier_... | 0.591716 | 0.656805 | ... | 0.630941 | 0.026078 | 39 | 0.660000 | 0.652593 | 0.640741 | 0.655070 | 0.652850 | 0.652251 | 0.006340 |
| 3 | 0.777626 | 0.024266 | 0.177339 | 0.015063 | gini | 3 | 100 | {'classifier__criterion': 'gini', 'classifier_... | 0.591716 | 0.636095 | ... | 0.626799 | 0.027309 | 42 | 0.645185 | 0.631852 | 0.634074 | 0.683198 | 0.642487 | 0.647359 | 0.018600 |
| 4 | 0.454392 | 0.015006 | 0.120792 | 0.008454 | gini | 9 | 10 | {'classifier__criterion': 'gini', 'classifier_... | 0.733728 | 0.766272 | ... | 0.749414 | 0.012639 | 33 | 0.922963 | 0.930370 | 0.925926 | 0.940785 | 0.937084 | 0.931426 | 0.006672 |
| 5 | 0.485814 | 0.012279 | 0.139335 | 0.017685 | gini | 9 | 20 | {'classifier__criterion': 'gini', 'classifier_... | 0.733728 | 0.772189 | ... | 0.770750 | 0.020396 | 27 | 0.950370 | 0.951111 | 0.947407 | 0.957069 | 0.954108 | 0.952013 | 0.003307 |
| 6 | 0.584069 | 0.007341 | 0.155558 | 0.008577 | gini | 9 | 50 | {'classifier__criterion': 'gini', 'classifier_... | 0.781065 | 0.781065 | ... | 0.786724 | 0.012377 | 22 | 0.954815 | 0.962222 | 0.954074 | 0.957809 | 0.962250 | 0.958234 | 0.003499 |
| 7 | 0.797730 | 0.022623 | 0.188612 | 0.025899 | gini | 9 | 100 | {'classifier__criterion': 'gini', 'classifier_... | 0.798817 | 0.775148 | ... | 0.794436 | 0.011245 | 19 | 0.963704 | 0.960741 | 0.958519 | 0.959289 | 0.966691 | 0.961789 | 0.003025 |
| 8 | 0.448755 | 0.006180 | 0.116383 | 0.007896 | gini | 15 | 10 | {'classifier__criterion': 'gini', 'classifier_... | 0.766272 | 0.784024 | ... | 0.768372 | 0.014550 | 28 | 0.990370 | 0.996296 | 0.992593 | 0.994819 | 0.993338 | 0.993483 | 0.002010 |
| 9 | 0.485313 | 0.015920 | 0.138020 | 0.012366 | gini | 15 | 20 | {'classifier__criterion': 'gini', 'classifier_... | 0.786982 | 0.775148 | ... | 0.783162 | 0.015902 | 23 | 0.998519 | 0.998519 | 0.997778 | 1.000000 | 0.997779 | 0.998519 | 0.000811 |
| 10 | 0.595568 | 0.022580 | 0.168810 | 0.007819 | gini | 15 | 50 | {'classifier__criterion': 'gini', 'classifier_... | 0.789941 | 0.798817 | ... | 0.810435 | 0.018558 | 10 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 11 | 0.841035 | 0.030858 | 0.198968 | 0.015932 | gini | 15 | 100 | {'classifier__criterion': 'gini', 'classifier_... | 0.798817 | 0.819527 | ... | 0.813390 | 0.012830 | 7 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 12 | 0.452309 | 0.008657 | 0.121316 | 0.008526 | gini | 20 | 10 | {'classifier__criterion': 'gini', 'classifier_... | 0.760355 | 0.730769 | ... | 0.748810 | 0.011990 | 34 | 0.993333 | 0.996296 | 0.992593 | 0.994819 | 0.997039 | 0.994816 | 0.001690 |
| 13 | 0.463184 | 0.017230 | 0.118539 | 0.005638 | gini | 20 | 20 | {'classifier__criterion': 'gini', 'classifier_... | 0.763314 | 0.786982 | ... | 0.777841 | 0.009953 | 24 | 1.000000 | 1.000000 | 1.000000 | 0.999260 | 1.000000 | 0.999852 | 0.000296 |
| 14 | 0.548115 | 0.028406 | 0.140276 | 0.010263 | gini | 20 | 50 | {'classifier__criterion': 'gini', 'classifier_... | 0.786982 | 0.804734 | ... | 0.796200 | 0.014801 | 16 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 15 | 0.778350 | 0.048602 | 0.177143 | 0.022403 | gini | 20 | 100 | {'classifier__criterion': 'gini', 'classifier_... | 0.798817 | 0.819527 | ... | 0.810430 | 0.010205 | 11 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 16 | 0.458473 | 0.039886 | 0.134288 | 0.009049 | entropy | 3 | 10 | {'classifier__criterion': 'entropy', 'classifi... | 0.556213 | 0.618343 | ... | 0.608440 | 0.031071 | 43 | 0.620741 | 0.641481 | 0.610370 | 0.640266 | 0.601036 | 0.622779 | 0.016040 |
| 17 | 0.515209 | 0.033711 | 0.145203 | 0.020477 | entropy | 3 | 20 | {'classifier__criterion': 'entropy', 'classifi... | 0.573964 | 0.612426 | ... | 0.607252 | 0.020617 | 45 | 0.628889 | 0.630370 | 0.608148 | 0.621021 | 0.613620 | 0.620410 | 0.008578 |
| 18 | 0.577494 | 0.012899 | 0.169026 | 0.014362 | entropy | 3 | 50 | {'classifier__criterion': 'entropy', 'classifi... | 0.606509 | 0.642012 | ... | 0.633319 | 0.022179 | 37 | 0.658519 | 0.649630 | 0.646667 | 0.678016 | 0.646188 | 0.655804 | 0.011957 |
| 19 | 0.780709 | 0.027696 | 0.191107 | 0.011373 | entropy | 3 | 100 | {'classifier__criterion': 'entropy', 'classifi... | 0.582840 | 0.636095 | ... | 0.627999 | 0.033827 | 40 | 0.641481 | 0.637778 | 0.637778 | 0.693560 | 0.638786 | 0.649877 | 0.021884 |
| 20 | 0.453015 | 0.023241 | 0.127255 | 0.009269 | entropy | 9 | 10 | {'classifier__criterion': 'entropy', 'classifi... | 0.754438 | 0.707101 | ... | 0.744084 | 0.025169 | 35 | 0.928889 | 0.930370 | 0.951852 | 0.930422 | 0.945226 | 0.937352 | 0.009388 |
| 21 | 0.477226 | 0.019093 | 0.129570 | 0.024728 | entropy | 9 | 20 | {'classifier__criterion': 'entropy', 'classifi... | 0.745562 | 0.745562 | ... | 0.766012 | 0.020157 | 29 | 0.957037 | 0.960000 | 0.953333 | 0.954848 | 0.960770 | 0.957198 | 0.002867 |
| 22 | 0.596847 | 0.020224 | 0.151270 | 0.009244 | entropy | 9 | 50 | {'classifier__criterion': 'entropy', 'classifi... | 0.789941 | 0.801775 | ... | 0.799772 | 0.008079 | 14 | 0.969630 | 0.971852 | 0.967407 | 0.963731 | 0.974093 | 0.969343 | 0.003583 |
| 23 | 0.823978 | 0.010875 | 0.182342 | 0.009519 | entropy | 9 | 100 | {'classifier__criterion': 'entropy', 'classifi... | 0.813609 | 0.804734 | ... | 0.803320 | 0.010966 | 12 | 0.973333 | 0.970370 | 0.968148 | 0.972613 | 0.971132 | 0.971119 | 0.001818 |
| 24 | 0.443420 | 0.018316 | 0.113017 | 0.006287 | entropy | 15 | 10 | {'classifier__criterion': 'entropy', 'classifi... | 0.772189 | 0.763314 | ... | 0.776665 | 0.010227 | 25 | 0.991111 | 0.994815 | 0.993333 | 0.992598 | 0.995559 | 0.993483 | 0.001582 |
| 25 | 0.486811 | 0.008389 | 0.138677 | 0.011558 | entropy | 15 | 20 | {'classifier__criterion': 'entropy', 'classifi... | 0.804734 | 0.778107 | ... | 0.794439 | 0.010364 | 17 | 1.000000 | 0.997037 | 0.998519 | 0.998520 | 1.000000 | 0.998815 | 0.001109 |
| 26 | 0.624152 | 0.020824 | 0.171189 | 0.011428 | entropy | 15 | 50 | {'classifier__criterion': 'entropy', 'classifi... | 0.795858 | 0.825444 | ... | 0.812205 | 0.012064 | 8 | 1.000000 | 1.000000 | 0.999259 | 1.000000 | 0.999260 | 0.999704 | 0.000363 |
| 27 | 0.827072 | 0.022144 | 0.219544 | 0.020858 | entropy | 15 | 100 | {'classifier__criterion': 'entropy', 'classifi... | 0.813609 | 0.828402 | ... | 0.819895 | 0.015508 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 28 | 0.470915 | 0.014403 | 0.122763 | 0.017876 | entropy | 20 | 10 | {'classifier__criterion': 'entropy', 'classifi... | 0.760355 | 0.745562 | ... | 0.756531 | 0.016388 | 31 | 0.996296 | 0.995556 | 0.991852 | 0.997039 | 0.994819 | 0.995112 | 0.001790 |
| 29 | 0.487367 | 0.036971 | 0.143179 | 0.019807 | entropy | 20 | 20 | {'classifier__criterion': 'entropy', 'classifi... | 0.786982 | 0.792899 | ... | 0.788515 | 0.012800 | 20 | 1.000000 | 1.000000 | 1.000000 | 0.999260 | 1.000000 | 0.999852 | 0.000296 |
| 30 | 0.613159 | 0.024992 | 0.154289 | 0.025237 | entropy | 20 | 50 | {'classifier__criterion': 'entropy', 'classifi... | 0.795858 | 0.831361 | ... | 0.816954 | 0.014285 | 3 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 31 | 0.830111 | 0.031568 | 0.193728 | 0.019888 | entropy | 20 | 100 | {'classifier__criterion': 'entropy', 'classifi... | 0.801775 | 0.828402 | ... | 0.815167 | 0.012639 | 5 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 32 | 0.446890 | 0.018891 | 0.132769 | 0.016630 | log_loss | 3 | 10 | {'classifier__criterion': 'log_loss', 'classif... | 0.556213 | 0.618343 | ... | 0.608440 | 0.031071 | 43 | 0.620741 | 0.641481 | 0.610370 | 0.640266 | 0.601036 | 0.622779 | 0.016040 |
| 33 | 0.490058 | 0.024082 | 0.130067 | 0.011621 | log_loss | 3 | 20 | {'classifier__criterion': 'log_loss', 'classif... | 0.573964 | 0.612426 | ... | 0.607252 | 0.020617 | 45 | 0.628889 | 0.630370 | 0.608148 | 0.621021 | 0.613620 | 0.620410 | 0.008578 |
| 34 | 0.562501 | 0.007835 | 0.161226 | 0.011303 | log_loss | 3 | 50 | {'classifier__criterion': 'log_loss', 'classif... | 0.606509 | 0.642012 | ... | 0.633319 | 0.022179 | 37 | 0.658519 | 0.649630 | 0.646667 | 0.678016 | 0.646188 | 0.655804 | 0.011957 |
| 35 | 0.749549 | 0.027882 | 0.184486 | 0.016143 | log_loss | 3 | 100 | {'classifier__criterion': 'log_loss', 'classif... | 0.582840 | 0.636095 | ... | 0.627999 | 0.033827 | 40 | 0.641481 | 0.637778 | 0.637778 | 0.693560 | 0.638786 | 0.649877 | 0.021884 |
| 36 | 0.466345 | 0.021004 | 0.140502 | 0.026115 | log_loss | 9 | 10 | {'classifier__criterion': 'log_loss', 'classif... | 0.754438 | 0.707101 | ... | 0.744084 | 0.025169 | 35 | 0.928889 | 0.930370 | 0.951852 | 0.930422 | 0.945226 | 0.937352 | 0.009388 |
| 37 | 0.513768 | 0.029720 | 0.151664 | 0.007223 | log_loss | 9 | 20 | {'classifier__criterion': 'log_loss', 'classif... | 0.745562 | 0.745562 | ... | 0.766012 | 0.020157 | 29 | 0.957037 | 0.960000 | 0.953333 | 0.954848 | 0.960770 | 0.957198 | 0.002867 |
| 38 | 0.616331 | 0.019435 | 0.166181 | 0.020030 | log_loss | 9 | 50 | {'classifier__criterion': 'log_loss', 'classif... | 0.789941 | 0.801775 | ... | 0.799772 | 0.008079 | 14 | 0.969630 | 0.971852 | 0.967407 | 0.963731 | 0.974093 | 0.969343 | 0.003583 |
| 39 | 0.842297 | 0.043482 | 0.192046 | 0.024481 | log_loss | 9 | 100 | {'classifier__criterion': 'log_loss', 'classif... | 0.813609 | 0.804734 | ... | 0.803320 | 0.010966 | 12 | 0.973333 | 0.970370 | 0.968148 | 0.972613 | 0.971132 | 0.971119 | 0.001818 |
| 40 | 0.456873 | 0.008268 | 0.137343 | 0.018367 | log_loss | 15 | 10 | {'classifier__criterion': 'log_loss', 'classif... | 0.772189 | 0.763314 | ... | 0.776665 | 0.010227 | 25 | 0.991111 | 0.994815 | 0.993333 | 0.992598 | 0.995559 | 0.993483 | 0.001582 |
| 41 | 0.489485 | 0.014697 | 0.143608 | 0.017063 | log_loss | 15 | 20 | {'classifier__criterion': 'log_loss', 'classif... | 0.804734 | 0.778107 | ... | 0.794439 | 0.010364 | 17 | 1.000000 | 0.997037 | 0.998519 | 0.998520 | 1.000000 | 0.998815 | 0.001109 |
| 42 | 0.631733 | 0.019557 | 0.179697 | 0.007794 | log_loss | 15 | 50 | {'classifier__criterion': 'log_loss', 'classif... | 0.795858 | 0.825444 | ... | 0.812205 | 0.012064 | 8 | 1.000000 | 1.000000 | 0.999259 | 1.000000 | 0.999260 | 0.999704 | 0.000363 |
| 43 | 0.884931 | 0.015797 | 0.194441 | 0.008474 | log_loss | 15 | 100 | {'classifier__criterion': 'log_loss', 'classif... | 0.813609 | 0.828402 | ... | 0.819895 | 0.015508 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 44 | 0.479005 | 0.028642 | 0.122950 | 0.005631 | log_loss | 20 | 10 | {'classifier__criterion': 'log_loss', 'classif... | 0.760355 | 0.745562 | ... | 0.756531 | 0.016388 | 31 | 0.996296 | 0.995556 | 0.991852 | 0.997039 | 0.994819 | 0.995112 | 0.001790 |
| 45 | 0.501150 | 0.014953 | 0.155046 | 0.011511 | log_loss | 20 | 20 | {'classifier__criterion': 'log_loss', 'classif... | 0.786982 | 0.792899 | ... | 0.788515 | 0.012800 | 20 | 1.000000 | 1.000000 | 1.000000 | 0.999260 | 1.000000 | 0.999852 | 0.000296 |
| 46 | 0.632020 | 0.029196 | 0.164375 | 0.013576 | log_loss | 20 | 50 | {'classifier__criterion': 'log_loss', 'classif... | 0.795858 | 0.831361 | ... | 0.816954 | 0.014285 | 3 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 47 | 0.869727 | 0.021405 | 0.194527 | 0.019569 | log_loss | 20 | 100 | {'classifier__criterion': 'log_loss', 'classif... | 0.801775 | 0.828402 | ... | 0.815167 | 0.012639 | 5 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
48 rows × 23 columns
grid_search.best_params_
{'classifier__criterion': 'entropy',
'classifier__max_depth': 15,
'classifier__n_estimators': 100}
# Refit the pipeline with the best hyperparameters found by the grid search.
# Strip the "classifier__" routing prefix so the values can be passed
# straight to RandomForestClassifier -- unlike re-typing each key by hand,
# this stays correct if the grid above is later extended.
best_rf_params = {
    key.split("__", 1)[1]: value
    for key, value in grid_search.best_params_.items()
    if key.startswith("classifier__")
}
rfe_model = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "classifier",
            RandomForestClassifier(n_jobs=8, random_state=42, **best_rf_params),
        ),
    ]
)
rfe_model.fit(X_train, y_train)
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier',
RandomForestClassifier(criterion='entropy', max_depth=15,
n_jobs=8, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder',
MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight',
'FCVC', 'NCP', 'CH2O', 'FAF',
'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC',
'CAEC', 'SMOKE', 'SCC',
'CALC', 'MTRANS'])])),
('classifier',
RandomForestClassifier(criterion='entropy', max_depth=15,
n_jobs=8, random_state=42))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('encoder', MinMaxScaler()),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=15))]),
['Age', 'Height', 'Weight', 'FCVC', 'NCP',
'CH2O', 'FAF', 'TUE']),
('cat',
Pipeline(steps=[('ordinal_encode',
OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=nan)),
('imputer',
KNNImputer(add_indicator=True,
n_neighbors=1)),
('encoder',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE',
'SCC', 'CALC', 'MTRANS'])])['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
MinMaxScaler()
KNNImputer(add_indicator=True, n_neighbors=15)
['Gender', 'FHO', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=nan)
KNNImputer(add_indicator=True, n_neighbors=1)
OneHotEncoder(drop='first', handle_unknown='ignore')
RandomForestClassifier(criterion='entropy', max_depth=15, n_jobs=8,
random_state=42)y_pred = rfe_model.predict(X_test)
report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
report
| 1 | 2 | 3 | 4 | accuracy | macro avg | weighted avg | |
|---|---|---|---|---|---|---|---|
| precision | 0.884615 | 0.760870 | 0.796610 | 0.888889 | 0.8487 | 0.832746 | 0.845484 |
| recall | 0.851852 | 0.603448 | 0.810345 | 0.943590 | 0.8487 | 0.802309 | 0.848700 |
| f1-score | 0.867925 | 0.673077 | 0.803419 | 0.915423 | 0.8487 | 0.814961 | 0.845415 |
| support | 54.000000 | 58.000000 | 116.000000 | 195.000000 | 0.8487 | 423.000000 | 423.000000 |
Overall the model makes an accurate prediction about 85% of the time. The highest-performing class is obese, with an F1 score of 91%, while the normal class underperforms with an F1 score of 67%.
# Confusion matrix on the test set: rows = actual class, columns = predicted.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, annot_kws={'size': 10},
            cmap=plt.cm.Blues, linewidths=0.2)
# Each heatmap cell spans [k, k+1], so labels belong at the cell centers
# (k + 0.5) on BOTH axes -- the original offset only the y ticks, leaving
# the x labels sitting on the cell edges.
tick_marks = np.arange(4) + 0.5
class_labels = [1, 2, 3, 4]
plt.xticks(tick_marks, class_labels, rotation=25)
plt.yticks(tick_marks, class_labels, rotation=0)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix on Test data')
plt.show()
Here we can see that the most common mistakes are between neighboring classes, such as Obese being falsely predicted as Overweight and vice versa.
# Tree Visualisation: render the first few estimators of the fitted forest,
# truncated to depth 3 so the plot stays readable.
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

n_trees_to_show = 1  # raise to inspect more estimators
# Hoist the (loop-invariant) feature names out of the loop; avoid naming a
# local `tree`, which shadows the sklearn.tree submodule.
feature_names = preprocessor.get_feature_names_out()
for estimator in rfe_model[-1].estimators_[:n_trees_to_show]:
    dot_data = export_graphviz(
        estimator,
        feature_names=feature_names,
        filled=True,       # color nodes by majority class
        max_depth=3,       # truncate the deep tree for legibility
        impurity=False,
        proportion=False,
    )
    display(graphviz.Source(dot_data))
# Permutation importance on the held-out test set: shuffle one feature at a
# time (10 repeats each) and measure the drop in the pipeline's score.
# NOTE(review): this rebinds `result`, clobbering the cv_results_ DataFrame
# from the grid-search cell above.
result = permutation_importance(
    rfe_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=8
)
# Order the features from most to least important.
sorted_idx = result.importances_mean.argsort()[::-1]
forest_importances = pd.Series(
    result.importances_mean[sorted_idx], index=X_train.columns[sorted_idx]
)
fig, ax = plt.subplots()
# Bar plots are vertical by default; the former orientation="vertical" kwarg
# is not an Axes.bar parameter and has been dropped.
forest_importances.plot.bar(
    yerr=result.importances_std[sorted_idx], ax=ax, color="skyblue"
)
ax.set_title("Feature importances using permutation")
ax.set_ylabel("Mean accuracy decrease (higher = more important)")
fig.tight_layout()
plt.show()
I conducted a permutation test on the test data using the random forest classifier. The idea of a permutation test is to break the association between a given feature and the weight class, then observe the effect on accuracy: permuting an important feature will decrease accuracy.
Weight, food consumption between meals (CAEC), Age, and Height are the most important features, followed by food habits such as the frequency of vegetable consumption (FCVC) or daily water consumption (CH2O).
The features with the least importance are smoking habit and means of transportation, as already revealed in the data exploration.
Derive new features, such as an activity level from MTRANS
Simplify Age into an age category
Add BMI as a feature